Abstract
Este informe presenta un análisis exhaustivo del dataset de reservas hoteleras de dos hoteles en Portugal (City Hotel y Resort Hotel), abarcando el período 2015-2017 con 119,390 observaciones. El estudio aplica técnicas de analítica visual para explorar patrones de cancelación, comportamiento de reservas, estacionalidad y características de los huéspedes. Se identifican factores clave que influyen en las cancelaciones, diferencias significativas entre tipos de hotel, y tendencias temporales en la demanda. El análisis integra visualizaciones interactivas que permiten descubrir insights relevantes para la gestión hotelera, como la relación entre lead time y cancelaciones, el impacto del tipo de depósito, y las preferencias según origen geográfico. Los resultados proporcionan una base sólida para la construcción de una narrativa de datos orientada a la toma de decisiones estratégicas en el sector hotelero.
El dataset hotel_bookings.csv contiene información detallada sobre reservas realizadas en dos hoteles portugueses: un City Hotel ubicado en Lisboa y un Resort Hotel en el Algarve. Los datos abarcan el período de 2015 a 2017 y proporcionan una visión comprehensiva del comportamiento de las reservas hoteleras.
# Dimensiones del dataset
cat("Dimensiones del dataset:\n")
## Dimensiones del dataset:
cat("- Observaciones:", nrow(df_csv), "\n")
## - Observaciones: 119390
cat("- Variables:", ncol(df_csv), "\n\n")
## - Variables: 32
# Primeras filas
kable(head(df_csv, 5), caption = "Primeras 5 observaciones del dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
font_size = 10) %>%
scroll_box(width = "100%")
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | country | market_segment | distribution_channel | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | assigned_room_type | booking_changes | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 3 | No Deposit | NULL | NULL | 0 | Transient | 0 | 0 | 0 | Check-Out | 2015-07-01 |
| Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 4 | No Deposit | NULL | NULL | 0 | Transient | 0 | 0 | 0 | Check-Out | 2015-07-01 |
| Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0 | 0 | BB | GBR | Direct | Direct | 0 | 0 | 0 | A | C | 0 | No Deposit | NULL | NULL | 0 | Transient | 75 | 0 | 0 | Check-Out | 2015-07-02 |
| Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0 | 0 | BB | GBR | Corporate | Corporate | 0 | 0 | 0 | A | A | 0 | No Deposit | 304 | NULL | 0 | Transient | 75 | 0 | 0 | Check-Out | 2015-07-02 |
| Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | 0 | 0 | BB | GBR | Online TA | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 240 | NULL | 0 | Transient | 98 | 0 | 1 | Check-Out | 2015-07-03 |
El dataset contiene 32 variables que describen diferentes aspectos de cada reserva:
# Crear tabla descriptiva de variables
var_desc <- data.frame(
Variable = names(df_csv),
Tipo = sapply(df_csv, class),
Valores_Unicos = sapply(df_csv, function(x) length(unique(x))),
Valores_NA = sapply(df_csv, function(x) sum(is.na(x)))
)
datatable(var_desc,
options = list(pageLength = 10, scrollX = TRUE),
caption = "Descripción de variables del dataset")
# Calcular porcentaje de valores faltantes
missing_data <- data.frame(
Variable = names(df_csv),
NA_Count = sapply(df_csv, function(x) sum(is.na(x))),
NA_Percentage = round(sapply(df_csv, function(x) sum(is.na(x))/length(x)*100), 2)
) %>%
filter(NA_Count > 0) %>%
arrange(desc(NA_Count))
if(nrow(missing_data) > 0) {
kable(missing_data, caption = "Variables con valores faltantes") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
# Visualización interactiva
p <- ggplot(missing_data, aes(x = reorder(Variable, NA_Percentage),
y = NA_Percentage)) +
geom_bar(stat = "identity", fill = "#FF6B6B") +
coord_flip() +
labs(title = "Porcentaje de Valores Faltantes por Variable",
x = "Variable", y = "Porcentaje (%)") +
theme_minimal()
ggplotly(p)
} else {
cat("No se detectaron valores faltantes en el dataset.\n")
}
# Variables numéricas para análisis de outliers
numeric_vars <- df_csv %>%
select_if(is.numeric) %>%
select(lead_time, stays_in_weekend_nights, stays_in_week_nights,
adults, children, babies, adr)
# Función para detectar outliers
detect_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower <- Q1 - 1.5 * IQR
upper <- Q3 + 1.5 * IQR
sum(x < lower | x > upper, na.rm = TRUE)
}
outliers_summary <- data.frame(
Variable = names(numeric_vars),
Outliers = sapply(numeric_vars, detect_outliers),
Porcentaje = round(sapply(numeric_vars, detect_outliers) / nrow(df_csv) * 100, 2)
)
kable(outliers_summary, caption = "Resumen de valores atípicos") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| Variable | Outliers | Porcentaje | |
|---|---|---|---|
| lead_time | lead_time | 3005 | 2.52 |
| stays_in_weekend_nights | stays_in_weekend_nights | 265 | 0.22 |
| stays_in_week_nights | stays_in_week_nights | 3354 | 2.81 |
| adults | adults | 29710 | 24.88 |
| children | children | 8590 | 7.19 |
| babies | babies | 917 | 0.77 |
| adr | adr | 3793 | 3.18 |
# Boxplots interactivos
plot_ly(data = df_csv, y = ~adr, type = "box", name = "ADR") %>%
layout(title = "Distribución de ADR (Average Daily Rate) - Detección de Outliers",
yaxis = list(title = "ADR (€)"))
# Crear variables adicionales para el análisis
df_csv <- df_csv %>%
mutate(
# Total de noches
total_nights = stays_in_weekend_nights + stays_in_week_nights,
# Total de huéspedes
total_guests = adults + children + babies,
# Fecha de llegada (convertir mes a número primero)
month_num = match(as.character(arrival_date_month), month.name),
arrival_date = as.Date(paste(arrival_date_year,
month_num,
arrival_date_day_of_month, sep = "-"),
format = "%Y-%m-%d"),
# Temporada
season = case_when(
arrival_date_month %in% c("December", "January", "February") ~ "Invierno",
arrival_date_month %in% c("March", "April", "May") ~ "Primavera",
arrival_date_month %in% c("June", "July", "August") ~ "Verano",
TRUE ~ "Otoño"
),
# Categoría de lead time
lead_time_category = case_when(
lead_time == 0 ~ "Mismo día",
lead_time <= 7 ~ "1 semana",
lead_time <= 30 ~ "1 mes",
lead_time <= 90 ~ "3 meses",
lead_time <= 180 ~ "6 meses",
TRUE ~ "Más de 6 meses"
),
# Cambio de habitación (convertir a character para comparar)
room_changed = ifelse(as.character(reserved_room_type) != as.character(assigned_room_type), 1, 0),
# Ingresos totales
total_revenue = adr * total_nights
) %>%
select(-month_num) # Eliminar variable temporal
cat("Variables derivadas creadas exitosamente.\n")
## Variables derivadas creadas exitosamente.
cat("Nuevas dimensiones:", dim(df_csv), "\n")
## Nuevas dimensiones: 119390 39
# Resumen de cancelaciones
cancel_summary <- df_csv %>%
group_by(hotel, is_canceled) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(hotel) %>%
mutate(percentage = round(count / sum(count) * 100, 2))
# Visualización interactiva
p1 <- ggplot(cancel_summary, aes(x = hotel, y = count,
fill = factor(is_canceled))) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent) +
scale_fill_manual(values = c("#4ECDC4", "#FF6B6B"),
labels = c("No cancelada", "Cancelada")) +
labs(title = "Tasa de Cancelación por Tipo de Hotel",
x = "Tipo de Hotel", y = "Porcentaje", fill = "Estado") +
theme_minimal()
ggplotly(p1)
# Tabla resumen
kable(cancel_summary %>%
pivot_wider(names_from = is_canceled, values_from = c(count, percentage)),
caption = "Resumen de cancelaciones por hotel") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| hotel | count_0 | count_1 | percentage_0 | percentage_1 |
|---|---|---|---|---|
| City Hotel | 46228 | 33102 | 58.27 | 41.73 |
| Resort Hotel | 28938 | 11122 | 72.24 | 27.76 |
# Lead time vs cancelación
p2 <- df_csv %>%
group_by(lead_time_category, is_canceled) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(lead_time_category) %>%
mutate(percentage = count / sum(count) * 100) %>%
filter(is_canceled == 1) %>%
ggplot(aes(x = reorder(lead_time_category, percentage),
y = percentage, fill = percentage)) +
geom_bar(stat = "identity") +
scale_fill_gradient(low = "#FFE66D", high = "#FF6B6B") +
coord_flip() +
labs(title = "Tasa de Cancelación según Lead Time",
x = "Categoría de Lead Time", y = "% Cancelaciones") +
theme_minimal() +
theme(legend.position = "none")
ggplotly(p2)
# Deposit type vs cancelación
p3 <- df_csv %>%
group_by(deposit_type, is_canceled) %>%
summarise(count = n(), .groups = "drop") %>%
ggplot(aes(x = deposit_type, y = count, fill = factor(is_canceled))) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(values = c("#4ECDC4", "#FF6B6B"),
labels = c("No cancelada", "Cancelada")) +
labs(title = "Cancelaciones según Tipo de Depósito",
x = "Tipo de Depósito", y = "Número de Reservas", fill = "Estado") +
theme_minimal()
ggplotly(p3)
# Reservas por mes y año
monthly_bookings <- df_csv %>%
filter(!is.na(arrival_date)) %>%
mutate(year_month = format(arrival_date, "%Y-%m")) %>%
group_by(year_month, hotel) %>%
summarise(count = n(), .groups = "drop")
p4 <- ggplot(monthly_bookings, aes(x = year_month, y = count,
color = hotel, group = hotel)) +
geom_line(size = 1) +
geom_point(size = 2) +
scale_color_manual(values = c("#FF6B6B", "#4ECDC4")) +
labs(title = "Evolución Temporal de Reservas por Hotel",
x = "Mes-Año", y = "Número de Reservas", color = "Hotel") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(p4)
# Estacionalidad
seasonal_data <- df_csv %>%
group_by(season, hotel) %>%
summarise(
total_bookings = n(),
avg_adr = mean(adr, na.rm = TRUE),
cancellation_rate = mean(is_canceled) * 100,
.groups = "drop"
)
p5 <- plot_ly(seasonal_data, x = ~season, y = ~total_bookings,
color = ~hotel, type = "bar") %>%
layout(title = "Reservas por Temporada y Hotel",
xaxis = list(title = "Temporada"),
yaxis = list(title = "Número de Reservas"),
barmode = "group")
p5
# Distribución de ADR
p6 <- plot_ly(df_csv %>% filter(adr > 0 & adr < 500),
x = ~adr, color = ~hotel, type = "histogram",
colors = c("#FF6B6B", "#4ECDC4")) %>%
layout(title = "Distribución de ADR por Hotel",
xaxis = list(title = "ADR (€)"),
yaxis = list(title = "Frecuencia"),
barmode = "overlay")
p6
# ADR promedio por mes
adr_monthly <- df_csv %>%
filter(adr > 0, !is.na(arrival_date)) %>%
mutate(month = format(arrival_date, "%B")) %>%
group_by(month, hotel) %>%
summarise(avg_adr = mean(adr, na.rm = TRUE), .groups = "drop")
# Ordenar meses
month_order <- c("January", "February", "March", "April", "May", "June",
"July", "August", "September", "October", "November", "December")
adr_monthly$month <- factor(adr_monthly$month, levels = month_order)
p7 <- ggplot(adr_monthly, aes(x = month, y = avg_adr,
fill = hotel, group = hotel)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(values = c("#FF6B6B", "#4ECDC4")) +
labs(title = "ADR Promedio por Mes",
x = "Mes", y = "ADR Promedio (€)", fill = "Hotel") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(p7)
# Top 15 países
top_countries <- df_csv %>%
filter(country != "NULL") %>%
group_by(country) %>%
summarise(
total_bookings = n(),
cancellation_rate = mean(is_canceled) * 100,
avg_adr = mean(adr, na.rm = TRUE)
) %>%
arrange(desc(total_bookings)) %>%
head(15)
p8 <- plot_ly(top_countries, x = ~reorder(country, total_bookings),
y = ~total_bookings, type = "bar",
marker = list(color = ~cancellation_rate,
colorscale = "Reds",
showscale = TRUE,
colorbar = list(title = "% Cancel."))) %>%
layout(title = "Top 15 Países por Número de Reservas",
xaxis = list(title = "País"),
yaxis = list(title = "Número de Reservas"))
p8
# Tabla de países
datatable(top_countries,
options = list(pageLength = 15, scrollX = TRUE),
caption = "Top 15 países por reservas") %>%
formatRound(columns = c("cancellation_rate", "avg_adr"), digits = 2)
# Distribución de segmentos
market_data <- df_csv %>%
group_by(market_segment, hotel) %>%
summarise(
count = n(),
avg_adr = mean(adr, na.rm = TRUE),
cancellation_rate = mean(is_canceled) * 100,
.groups = "drop"
)
p9 <- plot_ly(market_data, x = ~market_segment, y = ~count,
color = ~hotel, type = "bar") %>%
layout(title = "Distribución de Reservas por Segmento de Mercado",
xaxis = list(title = "Segmento"),
yaxis = list(title = "Número de Reservas"),
barmode = "stack")
p9
# Tabla resumen por segmento
kable(market_data %>%
arrange(desc(count)),
caption = "Resumen por segmento de mercado",
digits = 2) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| market_segment | hotel | count | avg_adr | cancellation_rate |
|---|---|---|---|---|
| Online TA | City Hotel | 38748 | 118.92 | 37.40 |
| Online TA | Resort Hotel | 17729 | 113.43 | 35.24 |
| Offline TA/TO | City Hotel | 16747 | 93.02 | 42.83 |
| Groups | City Hotel | 13975 | 84.92 | 68.86 |
| Offline TA/TO | Resort Hotel | 7472 | 74.66 | 15.23 |
| Direct | Resort Hotel | 6513 | 111.67 | 13.48 |
| Direct | City Hotel | 6093 | 119.48 | 17.33 |
| Groups | Resort Hotel | 5836 | 66.45 | 42.39 |
| Corporate | City Hotel | 2986 | 83.12 | 21.47 |
| Corporate | Resort Hotel | 2309 | 51.56 | 15.20 |
| Complementary | City Hotel | 542 | 2.60 | 11.81 |
| Aviation | City Hotel | 237 | 100.14 | 21.94 |
| Complementary | Resort Hotel | 201 | 3.66 | 16.42 |
| Undefined | City Hotel | 2 | 15.00 | 100.00 |
# Calcular métricas clave
insights <- list(
total_bookings = nrow(df_csv),
cancellation_rate = mean(df_csv$is_canceled) * 100,
avg_lead_time = mean(df_csv$lead_time),
avg_adr = mean(df_csv$adr, na.rm = TRUE),
most_common_country = names(sort(table(df_csv$country), decreasing = TRUE))[1],
peak_month = names(sort(table(df_csv$arrival_date_month), decreasing = TRUE))[1]
)
# Crear tabla de insights
insights_df <- data.frame(
Métrica = c("Total de Reservas", "Tasa de Cancelación (%)",
"Lead Time Promedio (días)", "ADR Promedio (€)",
"País Más Común", "Mes Pico"),
Valor = c(
format(insights$total_bookings, big.mark = ","),
round(insights$cancellation_rate, 2),
round(insights$avg_lead_time, 0),
round(insights$avg_adr, 2),
insights$most_common_country,
insights$peak_month
)
)
kable(insights_df, caption = "Métricas Clave del Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)
| Métrica | Valor |
|---|---|
| Total de Reservas | 119,390 |
| Tasa de Cancelación (%) | 37.04 |
| Lead Time Promedio (días) | 104 |
| ADR Promedio (€) | 101.83 |
| País Más Común | PRT |
| Mes Pico | August |
Cancelaciones:
Estacionalidad:
Ingresos:
El análisis del dataset de reservas hoteleras revela patrones complejos en el comportamiento de los clientes y oportunidades claras para optimizar la gestión hotelera:
Gestión de Cancelaciones: Las cancelaciones representan un desafío significativo, especialmente para reservas con lead times prolongados. La implementación de políticas de depósito podría mitigar este riesgo.
Optimización de Precios: La estacionalidad marcada sugiere oportunidades para implementar estrategias de revenue management más sofisticadas.
Segmentación de Mercado: La diversidad de segmentos y orígenes geográficos requiere estrategias de marketing diferenciadas.
Nota: Este documento ha sido generado de forma reproducible. Todas las visualizaciones son interactivas y pueden explorarse en el formato HTML.
cat("Información de la sesión:\n")
## Información de la sesión:
sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=Spanish_Spain.utf8 LC_CTYPE=Spanish_Spain.utf8
## [3] LC_MONETARY=Spanish_Spain.utf8 LC_NUMERIC=C
## [5] LC_TIME=Spanish_Spain.utf8
##
## time zone: Europe/Madrid
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] viridis_0.6.5 viridisLite_0.4.2 gridExtra_2.3 scales_1.4.0
## [5] kableExtra_1.4.0 knitr_1.50 DT_0.34.0 plotly_4.11.0
## [9] lubridate_1.9.4 forcats_1.0.1 stringr_1.6.0 dplyr_1.1.4
## [13] purrr_1.2.0 readr_2.1.6 tidyr_1.3.1 tibble_3.3.0
## [17] tidyverse_2.0.0 ggstatsplot_0.13.3 fitdistrplus_1.2-4 survival_3.8-3
## [21] MASS_7.3-65 ggmosaic_0.4.0 ggplot2_4.0.1
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.54 bslib_0.9.0
## [4] bayestestR_0.17.0 htmlwidgets_1.6.4 insight_1.4.2
## [7] ggrepel_0.9.6 lattice_0.22-7 tzdb_0.5.0
## [10] paletteer_1.6.0 crosstalk_1.2.2 vctrs_0.6.5
## [13] tools_4.5.1 generics_0.1.4 datawizard_1.3.0
## [16] pkgconfig_2.0.3 Matrix_1.7-3 data.table_1.17.8
## [19] RColorBrewer_1.1-3 correlation_0.8.8 S7_0.2.1
## [22] RcppParallel_5.1.11-1 lifecycle_1.0.4 compiler_4.5.1
## [25] farver_2.1.2 textshaping_1.0.4 codetools_0.2-20
## [28] htmltools_0.5.8.1 sass_0.4.10 yaml_2.3.10
## [31] lazyeval_0.2.2 pillar_1.11.1 jquerylib_0.1.4
## [34] cachem_1.1.0 statsExpressions_1.7.1 tidyselect_1.2.1
## [37] digest_0.6.38 stringi_1.8.7 rematch2_2.1.2
## [40] labeling_0.4.3 splines_4.5.1 fastmap_1.2.0
## [43] grid_4.5.1 cli_3.6.5 magrittr_2.0.4
## [46] patchwork_1.3.2 productplots_0.1.2 withr_3.0.2
## [49] timechange_0.3.0 rmarkdown_2.30 httr_1.4.7
## [52] hms_1.1.4 evaluate_1.0.5 parameters_0.28.2
## [55] rstantools_2.5.0 rlang_1.1.6 Rcpp_1.1.0
## [58] zeallot_0.2.0 glue_1.8.0 xml2_1.4.1
## [61] svglite_2.2.2 rstudioapi_0.17.1 jsonlite_2.0.0
## [64] effectsize_1.0.1 R6_2.6.1 plyr_1.8.9
## [67] systemfonts_1.3.1